library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(tidyverse)
library(stringr)
library(caret)
library(fpp2)
library(dygraphs)
library(xts)
library(pander)
library(purrr)
library(ggthemes)
library(gridExtra)
library(cowplot)
library(RColorBrewer)
library(gplots)
library(corrplot) 
library(functional)
library(fastNaiveBayes)
library(ggpubr)

Loading the data

dfdata<- readr::read_csv("CaseStudy2-data.csv")
head(dfdata)

Atrittion calculation

dfdata %>% count(Attrition) ->att
att

Exploratory plot for Attrition

You can also embed plots, for example:

agePlot <- ggplot(dfdata,aes(Age,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
agePlot

travelPlot <- ggplot(dfdata,aes(BusinessTravel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
travelPlot

ratePlot <- ggplot(dfdata,aes(DailyRate, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
ratePlot

depPlot <- ggplot(dfdata,aes(Department,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
depPlot

distPlot<- ggplot(dfdata,aes(DistanceFromHome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
distPlot

eduPlot <- ggplot(dfdata,aes(Education,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
eduPlot

edufieldPlot <- ggplot(dfdata,aes(EducationField,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
edufieldPlot

envPlot <- ggplot(dfdata,aes(EnvironmentSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
envPlot

genPlot <- ggplot(dfdata,aes(Gender,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
genPlot

hourlyPlot <- ggplot(dfdata,aes(HourlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
hourlyPlot

jobInvPlot <- ggplot(dfdata,aes(JobInvolvement,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #important
jobInvPlot

jobLevelPlot <- ggplot(dfdata,aes(JobLevel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobLevelPlot

jobSatPlot <- ggplot(dfdata,aes(JobSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobSatPlot

overTimePlot <- ggplot(dfdata,aes(OverTime,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
overTimePlot

hikePlot <- ggplot(dfdata,aes(PercentSalaryHike, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
hikePlot

perfPlot <- ggplot(dfdata,aes(PerformanceRating,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
perfPlot

RelSatPlot <- ggplot(dfdata,aes(RelationshipSatisfaction,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
RelSatPlot

StockPlot <- ggplot(dfdata,aes(StockOptionLevel,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
StockPlot

workingYearsPlot <- ggplot(dfdata,aes(TotalWorkingYears,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
workingYearsPlot

TrainTimesPlot <- ggplot(dfdata,aes(TrainingTimesLastYear,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
TrainTimesPlot

 WLBPlot<- ggplot(dfdata,aes(WorkLifeBalance,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
  WLBPlot

marPlot <- ggplot(dfdata,aes(MaritalStatus,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")#maybe
marPlot

monthlyIncPlot <- ggplot(dfdata,aes(MonthlyIncome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
monthlyIncPlot

monthlyRatePlot <- ggplot(dfdata,aes(MonthlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
monthlyRatePlot

numCompPlot <- ggplot(dfdata,aes(NumCompaniesWorked,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
numCompPlot

YearAtComPlot <- ggplot(dfdata,aes(YearsAtCompany,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearAtComPlot

YearInCurrPlot <- ggplot(dfdata,aes(YearsInCurrentRole,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
YearInCurrPlot

YearsSinceProm <- ggplot(dfdata,aes(YearsSinceLastPromotion,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
YearsSinceProm 

YearsCurrManPlot <- ggplot(dfdata,aes(YearsWithCurrManager,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearsCurrManPlot

myplot <- ggplot(dfdata, aes(BusinessTravel, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(Department, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(Education, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(EducationField, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(EnvironmentSatisfaction, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(Gender, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(JobInvolvement, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(JobLevel, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(JobSatisfaction, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(OverTime, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(RelationshipSatisfaction, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(StockOptionLevel, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(WorkLifeBalance, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(MaritalStatus, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(YearsInCurrentRole, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot #maybe

myplot <- ggplot(dfdata, aes(YearsWithCurrManager, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(YearsAtCompany, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important/maybe

myplot <- ggplot(dfdata, aes(NumCompaniesWorked, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(TrainingTimesLastYear, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(TotalWorkingYears, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(DistanceFromHome, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(Age, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(EducationField, group = JobSatisfaction )) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~JobSatisfaction)

myplot

Ecploratory plot for Montly income. This is with numerical values

ys <- names(dfdata)[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36)]
ys %>% map(function(y) 
  ggplot(dfdata , aes(MonthlyIncome)) + geom_point(aes_string(y=y)))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

#### More exploratory data for montly income with heatmap for numerical values

dfn <-dfdata[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36 )]

my_palette <- colorRampPalette(c("red", "white", "black"))
heatmapper <- function(df){
  df %>%
    keep(is.numeric) %>%
    tidyr::drop_na() %>%
    cor %>%
    heatmap.2(col = my_palette ,
              density.info = "none", trace = "none",
              dendogram = c("both"), symm = F,
              symkey = T, symbreaks = T, scale = "none",
              key = T)
}


heatmapper(dfn)

More exploratory data for montly income with corrplot for numerical values

correlator  <-  function(df){
  df %>%
    keep(is.numeric) %>%
    tidyr::drop_na() %>%
    cor %>%
    corrplot( addCoef.col = "white", number.digits = 2,
              number.cex = 0.5, method="square",
              order="hclust", title="Variable Corr Heatmap",
              tl.srt=45, tl.cex = 0.8)
}

correlator(dfn)

#### More exploratory data for montly income with ggplot numeric for numerical values

plotAllNumeric <- function(df){
  df%>%keep(is.numeric) %>%
    gather() %>%
    ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density()+geom_histogram() + theme_fivethirtyeight()
}


plotAllNumeric(dfn)

More exploratory data for montly income for categorical values

dfdata %>% keep(is.factor) %>% names -> label
ggplot(data = dfdata, aes(x = BusinessTravel, y = MonthlyIncome, fill =BusinessTravel )) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = Department, y = MonthlyIncome, fill = Department)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(Education), y = MonthlyIncome, fill = as.factor(Education))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important

ggplot(data = dfdata, aes(x = EducationField, y = MonthlyIncome, fill = EducationField)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(EnvironmentSatisfaction), y = MonthlyIncome, fill = as.factor(EnvironmentSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = Gender, y = MonthlyIncome, fill = Gender)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(JobInvolvement), y = MonthlyIncome, fill = as.factor(JobInvolvement))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(JobLevel), y = MonthlyIncome, fill = as.factor(JobLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important

ggplot(data = dfdata, aes(x = JobRole, y = MonthlyIncome, fill = JobRole)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()# important

ggplot(data = dfdata, aes(x = as.factor(JobSatisfaction), y = MonthlyIncome, fill = as.factor(JobSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = MaritalStatus, y = MonthlyIncome, fill = MaritalStatus)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = OverTime, y = MonthlyIncome, fill = OverTime)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(PerformanceRating), y = MonthlyIncome, fill = as.factor(PerformanceRating))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() #maybe

ggplot(data = dfdata, aes(x = as.factor(RelationshipSatisfaction), y = MonthlyIncome, fill = as.factor(RelationshipSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(StockOptionLevel), y = MonthlyIncome, fill = as.factor(StockOptionLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # maybe

ggplot(data = dfdata, aes(x = as.factor(WorkLifeBalance), y = MonthlyIncome, fill =as.factor(WorkLifeBalance))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

Cutting the variables for the dataframe that will be used in the model, witht the intention to have better performance of the model.

dfdata$YearsInCurrentRole<- cut(as.numeric(dfdata$YearsInCurrentRole), breaks = c(-1,1,100))
dfdata$TotalWorkingYears<- cut(as.numeric(dfdata$TotalWorkingYears), breaks = c(-1,1,100))
dfdata$YearsWithCurrManager<- cut(as.numeric(dfdata$YearsWithCurrManager), breaks = c(-1,1,100))
dfdata %>% filter(MaritalStatus %in% c("Single", "Divorce"))-> MaritalStatus
dfdata$MaritalStatus <- as.factor(dfdata$MaritalStatus)
levels(dfdata$MaritalStatus) <- c("NotMarried", "Married", "NotMarried")

Stting the dataframe for the model. After a lot of trial, this was the best model for attrition, we used Naive base model.

clasy<- dfdata[c("Attrition", "StockOptionLevel", "JobLevel", "MonthlyIncome", "OverTime")]

clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")] <- lapply(clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(clasy)

Split the train and test data

set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
  train_ind <- createDataPartition(df[[1]], p = p, list = list)
  cat("creating training dataset...\n")
  training <<- df[train_ind, ]
  cat("completed training dataset, creating test set\n")
  test <<- df[-train_ind, ]
  cat("done")
}

split(clasy)
## creating training dataset...
## completed training dataset, creating test set
## done

We run the Naive bayes model

library(doParallel)
numcores <- parallel::detectCores() - 1
cl <- makePSOCKcluster(numcores)
registerDoParallel(cl)


set.seed(3333)
trainMethod <- trainControl( method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)

fit.nb1 <- train(Attrition ~ .,  data = training, method = "nb", metric = "Spec", trControl = trainMethod)

fit.nb1
## Naive Bayes 
## 
## 653 samples
##   4 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times) 
## Summary of sample sizes: 627, 626, 627, 628, 626, 626, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  ROC        Sens       Spec  
##   FALSE      0.7232026  0.7445195  0.6584
##    TRUE      0.7668199  0.9992727  0.0000
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Spec was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE
##  and adjust = 1.
plot(fit.nb1)

Sensitivity : 0.7473

Specificity : 0.6857

Accuracy : 0.7373

test_pred <- predict(fit.nb1, newdata = test)
test_pred
##   [1] No  Yes No  Yes Yes No  No  No  No  Yes No  No  No  No  No  No  Yes
##  [18] No  Yes No  No  Yes Yes Yes Yes No  No  No  No  No  No  No  Yes No 
##  [35] No  No  Yes No  No  No  No  No  No  No  No  No  No  No  No  No  No 
##  [52] No  No  No  No  Yes No  No  Yes No  No  No  Yes No  No  No  Yes No 
##  [69] No  Yes No  No  No  No  Yes No  Yes Yes No  Yes No  No  No  Yes No 
##  [86] Yes No  Yes No  Yes No  No  No  Yes No  Yes No  No  No  No  Yes No 
## [103] Yes No  Yes No  No  No  Yes No  No  Yes No  No  Yes No  No  No  No 
## [120] No  Yes Yes Yes No  No  No  No  Yes No  No  Yes Yes Yes No  Yes Yes
## [137] Yes No  No  No  No  No  No  Yes No  No  No  No  Yes Yes Yes Yes No 
## [154] No  Yes No  Yes No  No  Yes No  No  No  No  No  No  No  Yes No  No 
## [171] Yes Yes No  Yes No  No  No  No  Yes Yes Yes No  Yes No  No  Yes Yes
## [188] Yes Yes No  No  No  No  No  No  Yes Yes No  No  Yes No  Yes No  Yes
## [205] No  No  Yes No  No  No  Yes No  No  No  No  No  No 
## Levels: No Yes
confusionMatrix(test_pred, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  136  11
##        Yes  46  24
##                                           
##                Accuracy : 0.7373          
##                  95% CI : (0.6735, 0.7946)
##     No Information Rate : 0.8387          
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.3084          
##                                           
##  Mcnemar's Test P-Value : 6.687e-06       
##                                           
##             Sensitivity : 0.7473          
##             Specificity : 0.6857          
##          Pos Pred Value : 0.9252          
##          Neg Pred Value : 0.3429          
##              Prevalence : 0.8387          
##          Detection Rate : 0.6267          
##    Detection Prevalence : 0.6774          
##       Balanced Accuracy : 0.7165          
##                                           
##        'Positive' Class : No              
## 
stopCluster(cl)

Setting the dataframe for monthly income, we will use regression.

dfdata<- readr::read_csv("CaseStudy2-data.csv")
reg <- dfdata[c("TotalWorkingYears", "YearsAtCompany", "Age", "Education", "JobLevel", "JobRole", "MonthlyIncome", "PerformanceRating", "StockOptionLevel" )]

reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)

reg

Train and test split

set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
  train_ind <- createDataPartition(df[[1]], p = p, list = list)
  cat("creating training dataset...\n")
  training <<- df[train_ind, ]
  cat("completed training dataset, creating test set\n")
  test <<- df[-train_ind, ]
  cat("done")
}

split(reg)
## creating training dataset...
## completed training dataset, creating test set
## done

After alot of trial and error and review of the visualization above we picked up the best model, which is the one below.

RMSE = 927.0692

regincome1 <- lm( MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole, data  = training)
summary(regincome1)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole, 
##     data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3085.0  -647.5   -97.3   638.3  4284.1 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3642.521    220.489  16.520  < 2e-16 ***
## TotalWorkingYears                53.411      9.134   5.847 7.96e-09 ***
## JobLevel2                      1598.495    170.893   9.354  < 2e-16 ***
## JobLevel3                      4650.058    226.324  20.546  < 2e-16 ***
## JobLevel4                      7872.592    345.320  22.798  < 2e-16 ***
## JobLevel5                     10669.268    394.495  27.045  < 2e-16 ***
## JobRoleHuman Resources        -1242.551    320.507  -3.877 0.000117 ***
## JobRoleLaboratory Technician  -1309.554    212.884  -6.151 1.35e-09 ***
## JobRoleManager                 3557.429    293.615  12.116  < 2e-16 ***
## JobRoleManufacturing Director   239.931    187.576   1.279 0.201319    
## JobRoleResearch Director       3701.321    252.276  14.672  < 2e-16 ***
## JobRoleResearch Scientist     -1091.211    216.239  -5.046 5.87e-07 ***
## JobRoleSales Executive            8.403    162.191   0.052 0.958697    
## JobRoleSales Representative   -1399.737    261.799  -5.347 1.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1036 on 641 degrees of freedom
## Multiple R-squared:  0.9504, Adjusted R-squared:  0.9494 
## F-statistic:   944 on 13 and 641 DF,  p-value: < 2.2e-16
pred <- predict(regincome1, newdata = test)
str(pred)
##  Named num [1:215] 9067 9049 2872 9013 5668 ...
##  - attr(*, "names")= chr [1:215] "1" "2" "3" "4" ...
ASA2 <- mean((pred[1:nrow(test)] - test$MonthlyIncome)^2)
sqrt(ASA2)
## [1] 927.0692

AIC and BIC

AIC(regincome1)
## [1] 10970.58
BIC(regincome1)
## [1] 11037.85

We run our model against the test data

pred2 <- predict(regincome1, newdata = test)
pred2
##         1         2         3         4         5         6         7 
##  9066.617  9048.732  2871.775  9013.207  5668.301  2560.202  2934.077 
##         8         9        10        11        12        13        14 
##  5730.115  5908.232  5854.821  5775.123  5783.526  2867.074  9102.143 
##        15        16        17        18        19        20        21 
##  6103.990 19792.003  8781.679  6068.464  2867.074  2403.017  8835.089 
##        22        23        24        25        26        27        28 
##  8835.089  9529.428  2386.378  6015.054 13357.364  6317.632  5775.123 
##        29        30        31        32        33        34        35 
## 16300.988  6148.998  8995.321  5569.883  2818.364  2764.953  6095.587 
##        36        37        38        39        40        41        42 
##  4465.569  6015.054  5218.019 12858.783  4412.158  2760.253 13517.596 
##        43        44        45        46        47        48        49 
##  5801.411  2386.378  4683.912 16658.522  2658.132  8781.679  9013.207 
##        50        51        52        53        54        55        56 
## 19508.610  2867.074  9574.436  2386.378 12698.551  2978.596 18990.843 
##        57        58        59        60        61        62        63 
##  2818.364  2604.721 13117.433  6015.054 12858.783  6103.990 19151.075 
##        64        65        66        67        68        69        70 
## 13008.704  2563.249  9120.028  5890.347  2604.721 12848.472  5676.704 
##        71        72        73        74        75        76        77 
##  2653.431 16071.005  2386.378  5783.526  4321.975  5836.936  2658.132 
##        78        79        80        81        82        83        84 
##  2813.664  5569.883  4732.622  4518.980 12741.650  2600.021  2871.775 
##        85        86        87        88        89        90        91 
##  6015.054  5454.659  5164.608 16300.988  9048.732 19471.539  5676.704 
##        92        93        94        95        96        97        98 
## 12645.140  5801.411  5004.376  5668.301  9529.428  6050.579  2818.364 
##        99       100       101       102       103       104       105 
## 19188.146  4732.622  2764.953  4683.912 16925.576  5997.168  2706.842 
##       106       107       108       109       110       111       112 
## 16728.273  8835.089  2973.896  4950.965  2546.610  5775.123  2827.256 
##       113       114       115       116       117       118       119 
##  2871.775  5783.526  4465.569  2871.775  5676.704  2978.596  2296.195 
##       120       121       122       123       124       125       126 
##  8986.918  2403.017  2706.842  2600.021  2925.185  5561.480  2871.775 
##       127       128       129       130       131       132       133 
##  9360.793  2604.721  6228.696  3085.417  8531.936  2720.434  4532.572 
##       134       135       136       137       138       139       140 
##  2546.610  2706.842  2871.775  5783.526  5997.168  2604.721  5569.883 
##       141       142       143       144       145       146       147 
##  2706.842  5561.480  5057.787  2776.891  2349.606 16925.576  5730.115 
##       148       149       150       151       152       153       154 
##  8773.276  7530.725  2760.253 12634.829  9760.956  9048.732  6264.222 
##       155       156       157       158       159       160       161 
##  2439.789  8781.679  2818.364 12651.169  5801.411  4465.569  8995.321 
##       162       163       164       165       166       167       168 
##  2493.199  4465.569 17139.219  5569.883  3085.417  2818.364  6442.339 
##       169       170       171       172       173       174       175 
##  2925.185  4532.572  2711.543  2403.017  4897.555  4683.912  9066.617 
##       176       177       178       179       180       181       182 
##  5961.643  2760.253  2818.364  2386.378  2453.381 12384.115  5988.765 
##       183       184       185       186       187       188       189 
##  2764.953  2349.606 16461.220  6264.222  4465.569  3085.417  6050.579 
##       190       191       192       193       194       195       196 
##  9360.793  2764.953  2773.845  5463.061 15697.130 12651.169 19829.074 
##       197       198       199       200       201       202       203 
##  3032.007 16194.166  4839.444  9093.740  5569.883  2658.132  2706.842 
##       204       205       206       207       208       209       210 
##  2439.789  2813.664 19578.360  2332.967  5569.883  4465.569  2386.378 
##       211       212       213       214       215 
##  5463.061  2867.074  5694.589 13019.015  5836.936
noinc <- read_csv("CaseStudy2CompSet No Salary (2).csv")
noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)
noinc
preds <-predict(regincome1, newdata =  noinc)
preddf <- data.frame(predicted = preds, ID = noinc$ID)
preddf <- preddf %>% arrange(ID)
write.csv(preddf, "Case2PredictionsSavorgnanSalary.csv")
library(readr)
noatt <- read_csv("CaseStudy2CompSet No Attrition.csv")
noatt[c("StockOptionLevel", "JobLevel", "OverTime")] <- lapply(noatt[c("StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(noatt)
test_preda <- predict(fit.nb1, newdata = noatt)
test_preda
##   [1] No  No  No  No  No  No  No  Yes No  No  Yes Yes No  No  Yes No  No 
##  [18] No  No  No  Yes No  Yes No  No  No  No  Yes No  Yes Yes No  No  No 
##  [35] Yes No  No  No  No  No  Yes Yes No  No  Yes No  No  No  Yes No  No 
##  [52] Yes No  No  No  No  Yes No  No  Yes No  No  Yes No  No  No  No  No 
##  [69] No  Yes No  No  No  No  No  Yes No  Yes Yes No  No  No  No  No  No 
##  [86] No  No  No  No  No  No  No  Yes No  No  No  No  No  Yes Yes No  No 
## [103] Yes No  No  No  No  No  Yes No  No  No  No  No  No  No  Yes Yes Yes
## [120] No  No  Yes No  No  No  No  Yes No  No  No  No  No  No  No  No  No 
## [137] No  No  Yes No  No  No  Yes Yes Yes No  No  Yes Yes No  No  No  No 
## [154] No  Yes No  Yes Yes Yes No  No  No  Yes No  No  Yes No  No  Yes No 
## [171] Yes Yes No  Yes No  No  No  No  No  No  No  No  No  No  No  No  No 
## [188] Yes No  No  Yes No  No  No  Yes Yes Yes No  No  No  No  No  Yes No 
## [205] No  Yes No  No  No  No  Yes No  Yes No  No  Yes No  No  No  No  Yes
## [222] No  No  No  No  Yes Yes No  Yes No  Yes Yes No  Yes Yes Yes Yes No 
## [239] No  No  Yes Yes No  Yes No  No  Yes Yes No  Yes Yes No  No  Yes No 
## [256] No  No  Yes Yes No  No  No  No  No  No  No  No  No  Yes No  No  No 
## [273] No  Yes Yes Yes No  No  No  Yes No  No  No  No  No  Yes Yes Yes Yes
## [290] Yes No  No  No  No  Yes Yes No  Yes No  No 
## Levels: No Yes
predi <- data.frame(predicted = test_preda, ID = noatt$ID)
predited <- predi %>% arrange(ID)
write.csv(predited,"Case2PredictionsSavorgnanAttrition.csv")